home *** CD-ROM | disk | FTP | other *** search
/ Komputer for Alle 2004 #2 / K-CD-2-2004.ISO / OpenOffice Sv / f_0397 / python-core-2.2.2 / lib / markupbase.py < prev    next >
Encoding:
Python Source  |  2003-07-18  |  10.7 KB  |  318 lines

  1. """Shared support for scanning document type declarations in HTML and XHTML."""
  2.  
  3. import re
  4. import string
  5.  
  6. _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
  7. _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
  8.  
  9. del re
  10.  
  11.  
  12. class ParserBase:
  13.     """Parser base class which provides some common support methods used
  14.     by the SGML/HTML and XHTML parsers."""
  15.  
  16.     def __init__(self):
  17.         if self.__class__ is ParserBase:
  18.             raise RuntimeError(
  19.                 "markupbase.ParserBase must be subclassed")
  20.  
  21.     def error(self, message):
  22.         raise NotImplementedError(
  23.             "subclasses of ParserBase must override error()")
  24.  
  25.     def reset(self):
  26.         self.lineno = 1
  27.         self.offset = 0
  28.  
  29.     def getpos(self):
  30.         """Return current line number and offset."""
  31.         return self.lineno, self.offset
  32.  
  33.     # Internal -- update line number and offset.  This should be
  34.     # called for each piece of data exactly once, in order -- in other
  35.     # words the concatenation of all the input strings to this
  36.     # function should be exactly the entire input.
  37.     def updatepos(self, i, j):
  38.         if i >= j:
  39.             return j
  40.         rawdata = self.rawdata
  41.         nlines = string.count(rawdata, "\n", i, j)
  42.         if nlines:
  43.             self.lineno = self.lineno + nlines
  44.             pos = string.rindex(rawdata, "\n", i, j) # Should not fail
  45.             self.offset = j-(pos+1)
  46.         else:
  47.             self.offset = self.offset + j-i
  48.         return j
  49.  
  50.     _decl_otherchars = ''
  51.  
  52.     # Internal -- parse declaration (for use by subclasses).
  53.     def parse_declaration(self, i):
  54.         # This is some sort of declaration; in "HTML as
  55.         # deployed," this should only be the document type
  56.         # declaration ("<!DOCTYPE html...>").
  57.         rawdata = self.rawdata
  58.         j = i + 2
  59.         assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
  60.         if rawdata[j:j+1] in ("-", ""):
  61.             # Start of comment followed by buffer boundary,
  62.             # or just a buffer boundary.
  63.             return -1
  64.         # in practice, this should look like: ((name|stringlit) S*)+ '>'
  65.         n = len(rawdata)
  66.         decltype, j = self._scan_name(j, i)
  67.         if j < 0:
  68.             return j
  69.         if decltype == "doctype":
  70.             self._decl_otherchars = ''
  71.         while j < n:
  72.             c = rawdata[j]
  73.             if c == ">":
  74.                 # end of declaration syntax
  75.                 data = rawdata[i+2:j]
  76.                 if decltype == "doctype":
  77.                     self.handle_decl(data)
  78.                 else:
  79.                     self.unknown_decl(data)
  80.                 return j + 1
  81.             if c in "\"'":
  82.                 m = _declstringlit_match(rawdata, j)
  83.                 if not m:
  84.                     return -1 # incomplete
  85.                 j = m.end()
  86.             elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
  87.                 name, j = self._scan_name(j, i)
  88.             elif c in self._decl_otherchars:
  89.                 j = j + 1
  90.             elif c == "[":
  91.                 if decltype == "doctype":
  92.                     j = self._parse_doctype_subset(j + 1, i)
  93.                 else:
  94.                     self.error("unexpected '[' char in declaration")
  95.             else:
  96.                 self.error(
  97.                     "unexpected %s char in declaration" % `rawdata[j]`)
  98.             if j < 0:
  99.                 return j
  100.         return -1 # incomplete
  101.  
  102.     # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
  103.     # returning the index just past any whitespace following the trailing ']'.
  104.     def _parse_doctype_subset(self, i, declstartpos):
  105.         rawdata = self.rawdata
  106.         n = len(rawdata)
  107.         j = i
  108.         while j < n:
  109.             c = rawdata[j]
  110.             if c == "<":
  111.                 s = rawdata[j:j+2]
  112.                 if s == "<":
  113.                     # end of buffer; incomplete
  114.                     return -1
  115.                 if s != "<!":
  116.                     self.updatepos(declstartpos, j + 1)
  117.                     self.error("unexpected char in internal subset (in %s)"
  118.                                % `s`)
  119.                 if (j + 2) == n:
  120.                     # end of buffer; incomplete
  121.                     return -1
  122.                 if (j + 4) > n:
  123.                     # end of buffer; incomplete
  124.                     return -1
  125.                 if rawdata[j:j+4] == "<!--":
  126.                     j = self.parse_comment(j, report=0)
  127.                     if j < 0:
  128.                         return j
  129.                     continue
  130.                 name, j = self._scan_name(j + 2, declstartpos)
  131.                 if j == -1:
  132.                     return -1
  133.                 if name not in ("attlist", "element", "entity", "notation"):
  134.                     self.updatepos(declstartpos, j + 2)
  135.                     self.error(
  136.                         "unknown declaration %s in internal subset" % `name`)
  137.                 # handle the individual names
  138.                 meth = getattr(self, "_parse_doctype_" + name)
  139.                 j = meth(j, declstartpos)
  140.                 if j < 0:
  141.                     return j
  142.             elif c == "%":
  143.                 # parameter entity reference
  144.                 if (j + 1) == n:
  145.                     # end of buffer; incomplete
  146.                     return -1
  147.                 s, j = self._scan_name(j + 1, declstartpos)
  148.                 if j < 0:
  149.                     return j
  150.                 if rawdata[j] == ";":
  151.                     j = j + 1
  152.             elif c == "]":
  153.                 j = j + 1
  154.                 while j < n and rawdata[j] in string.whitespace:
  155.                     j = j + 1
  156.                 if j < n:
  157.                     if rawdata[j] == ">":
  158.                         return j
  159.                     self.updatepos(declstartpos, j)
  160.                     self.error("unexpected char after internal subset")
  161.                 else:
  162.                     return -1
  163.             elif c in string.whitespace:
  164.                 j = j + 1
  165.             else:
  166.                 self.updatepos(declstartpos, j)
  167.                 self.error("unexpected char %s in internal subset" % `c`)
  168.         # end of buffer reached
  169.         return -1
  170.  
  171.     # Internal -- scan past <!ELEMENT declarations
  172.     def _parse_doctype_element(self, i, declstartpos):
  173.         name, j = self._scan_name(i, declstartpos)
  174.         if j == -1:
  175.             return -1
  176.         # style content model; just skip until '>'
  177.         rawdata = self.rawdata
  178.         if '>' in rawdata[j:]:
  179.             return string.find(rawdata, ">", j) + 1
  180.         return -1
  181.  
  182.     # Internal -- scan past <!ATTLIST declarations
  183.     def _parse_doctype_attlist(self, i, declstartpos):
  184.         rawdata = self.rawdata
  185.         name, j = self._scan_name(i, declstartpos)
  186.         c = rawdata[j:j+1]
  187.         if c == "":
  188.             return -1
  189.         if c == ">":
  190.             return j + 1
  191.         while 1:
  192.             # scan a series of attribute descriptions; simplified:
  193.             #   name type [value] [#constraint]
  194.             name, j = self._scan_name(j, declstartpos)
  195.             if j < 0:
  196.                 return j
  197.             c = rawdata[j:j+1]
  198.             if c == "":
  199.                 return -1
  200.             if c == "(":
  201.                 # an enumerated type; look for ')'
  202.                 if ")" in rawdata[j:]:
  203.                     j = string.find(rawdata, ")", j) + 1
  204.                 else:
  205.                     return -1
  206.                 while rawdata[j:j+1] in string.whitespace:
  207.                     j = j + 1
  208.                 if not rawdata[j:]:
  209.                     # end of buffer, incomplete
  210.                     return -1
  211.             else:
  212.                 name, j = self._scan_name(j, declstartpos)
  213.             c = rawdata[j:j+1]
  214.             if not c:
  215.                 return -1
  216.             if c in "'\"":
  217.                 m = _declstringlit_match(rawdata, j)
  218.                 if m:
  219.                     j = m.end()
  220.                 else:
  221.                     return -1
  222.                 c = rawdata[j:j+1]
  223.                 if not c:
  224.                     return -1
  225.             if c == "#":
  226.                 if rawdata[j:] == "#":
  227.                     # end of buffer
  228.                     return -1
  229.                 name, j = self._scan_name(j + 1, declstartpos)
  230.                 if j < 0:
  231.                     return j
  232.                 c = rawdata[j:j+1]
  233.                 if not c:
  234.                     return -1
  235.             if c == '>':
  236.                 # all done
  237.                 return j + 1
  238.  
  239.     # Internal -- scan past <!NOTATION declarations
  240.     def _parse_doctype_notation(self, i, declstartpos):
  241.         name, j = self._scan_name(i, declstartpos)
  242.         if j < 0:
  243.             return j
  244.         rawdata = self.rawdata
  245.         while 1:
  246.             c = rawdata[j:j+1]
  247.             if not c:
  248.                 # end of buffer; incomplete
  249.                 return -1
  250.             if c == '>':
  251.                 return j + 1
  252.             if c in "'\"":
  253.                 m = _declstringlit_match(rawdata, j)
  254.                 if not m:
  255.                     return -1
  256.                 j = m.end()
  257.             else:
  258.                 name, j = self._scan_name(j, declstartpos)
  259.                 if j < 0:
  260.                     return j
  261.  
  262.     # Internal -- scan past <!ENTITY declarations
  263.     def _parse_doctype_entity(self, i, declstartpos):
  264.         rawdata = self.rawdata
  265.         if rawdata[i:i+1] == "%":
  266.             j = i + 1
  267.             while 1:
  268.                 c = rawdata[j:j+1]
  269.                 if not c:
  270.                     return -1
  271.                 if c in string.whitespace:
  272.                     j = j + 1
  273.                 else:
  274.                     break
  275.         else:
  276.             j = i
  277.         name, j = self._scan_name(j, declstartpos)
  278.         if j < 0:
  279.             return j
  280.         while 1:
  281.             c = self.rawdata[j:j+1]
  282.             if not c:
  283.                 return -1
  284.             if c in "'\"":
  285.                 m = _declstringlit_match(rawdata, j)
  286.                 if m:
  287.                     j = m.end()
  288.                 else:
  289.                     return -1    # incomplete
  290.             elif c == ">":
  291.                 return j + 1
  292.             else:
  293.                 name, j = self._scan_name(j, declstartpos)
  294.                 if j < 0:
  295.                     return j
  296.  
  297.     # Internal -- scan a name token and the new position and the token, or
  298.     # return -1 if we've reached the end of the buffer.
  299.     def _scan_name(self, i, declstartpos):
  300.         rawdata = self.rawdata
  301.         n = len(rawdata)
  302.         if i == n:
  303.             return None, -1
  304.         m = _declname_match(rawdata, i)
  305.         if m:
  306.             s = m.group()
  307.             name = s.strip()
  308.             if (i + len(s)) == n:
  309.                 return None, -1  # end of buffer
  310.             return string.lower(name), m.end()
  311.         else:
  312.             self.updatepos(declstartpos, i)
  313.             self.error("expected name token")
  314.  
  315.     # To be overridden -- handlers for unknown objects
  316.     def unknown_decl(self, data):
  317.         pass
  318.